In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
pd.__version__ # need 0.14.0 for multiindex slicing
Out[1]:
In [2]:
# Label window applied to the (K, M) row index: K from 6 to 10, M from 160 to 200.
# .loc label slices include both end points.
_km_window_3d = (slice(6, 10), slice(160, 200))

# Overall statistics read from the "_3d" file: rows (K, M), one column per STATISTIC.
oe = (
    pd.read_table("overall_statistics_3d.txt")
    .set_index(["K", "M", "STATISTIC"])["VALUE"]
    .unstack()  # STATISTIC -> columns
    .loc(axis=0)[_km_window_3d]
)

# Per-variable statistics read from the "_3d" file: rows (K, M),
# columns (VARIABLE, STATISTIC).
ve = (
    pd.read_table("variable_statistics_3d.txt")
    .set_index(["K", "M", "STATISTIC", "VARIABLE"])["VALUE"]
    .unstack()  # VARIABLE -> columns
    .unstack()  # STATISTIC -> inner column level
    .loc(axis=0)[_km_window_3d]
)
In [3]:
# Label window applied to the (K, M) row index: K from 6 to 10, M from 160 to 200.
# .loc label slices include both end points.
_km_window_ksmall = (slice(6, 10), slice(160, 200))

# Overall statistics read from the "_ksmall" file: rows (K, M), one column per STATISTIC.
oi = (
    pd.read_table("overall_statistics_ksmall.txt")
    .set_index(["K", "M", "STATISTIC"])["VALUE"]
    .unstack()  # STATISTIC -> columns
    .loc(axis=0)[_km_window_ksmall]
)

# Per-variable statistics read from the "_ksmall" file: rows (K, M),
# columns (VARIABLE, STATISTIC).
vi = (
    pd.read_table("variable_statistics_ksmall.txt")
    .set_index(["K", "M", "STATISTIC", "VARIABLE"])["VALUE"]
    .unstack()  # VARIABLE -> columns
    .unstack()  # STATISTIC -> inner column level
    .loc(axis=0)[_km_window_ksmall]
)
In [4]:
# Sizes for 3D variables, vertical stacking.
N_c = 88 * 30
N_d = 48602
original_size = N_c * N_d


def compressed_size(K, M):
    """Return N_d + N_c*K + N_d*M + N_c*K*M for the given K and M.

    K and M may be scalars or numpy arrays (evaluated element-wise).
    N_c and N_d are read from the notebook namespace at call time.
    """
    return N_d + N_c * K + N_d * M + N_c * K * M


# One K and one M value per row of `oe`, taken from its index levels.
_k_of_oe_rows = np.array(oe.index.get_level_values("K"))
_m_of_oe_rows = np.array(oe.index.get_level_values("M"))

# NOTE(review): `/` must be true division here (Python 3); under Python 2 integer
# operands would floor to 0 -- confirm the kernel version.
oe["compression_ratio_fixed"] = compressed_size(_k_of_oe_rows, _m_of_oe_rows) / original_size
# To inspect as a K-by-M table: oe.loc[:, "compression_ratio_fixed"].unstack("K")
In [5]:
# Sizes for all variables, vertical stacking.
N_c = 3008
N_d = 48602
original_size = N_c * N_d


def compressed_size(K, M):
    """Return N_d + N_c*K + N_d*M + N_c*K*M for the given K and M.

    K and M may be scalars or numpy arrays (evaluated element-wise).
    N_c and N_d are read from the notebook namespace at call time.
    """
    return N_d + N_c * K + N_d * M + N_c * K * M


# One K and one M value per row of `oi`, taken from its index levels.
_k_of_oi_rows = np.array(oi.index.get_level_values("K"))
_m_of_oi_rows = np.array(oi.index.get_level_values("M"))

# NOTE(review): `/` must be true division here (Python 3); under Python 2 integer
# operands would floor to 0 -- confirm the kernel version.
oi["compression_ratio_fixed"] = compressed_size(_k_of_oi_rows, _m_of_oi_rows) / original_size
# To inspect as a K-by-M table: oi.loc[:, "compression_ratio_fixed"].unstack("K")
In [24]:
# rms error vs compression ratio, one line per K
#
# For the rows with K in 7..10: average every statistic over all variables, attach the
# overall statistics (which carry `compression_ratio_fixed`) and group the rows by K.
# The per-STATISTIC mean over the (VARIABLE, STATISTIC) columns is written as
# transpose -> groupby(level=...) -> transpose because `DataFrame.mean(level=...)`
# was deprecated in pandas 1.3 and removed in 2.0; this spelling also runs on old pandas.
# sort=False keeps the STATISTIC labels in their order of first appearance.
grouped_e = (
    ve.loc(axis=0)[7:10, :]
    .T.groupby(level="STATISTIC", sort=False).mean().T
    .join(oe)
    .reset_index()
    .groupby("K")
)
grouped_i = (
    vi.loc(axis=0)[7:10, :]
    .T.groupby(level="STATISTIC", sort=False).mean().T
    .join(oi)
    .reset_index()
    .groupby("K")
)

# Dashed lines: data from `ve`/`oe` ("excl."); solid lines: data from `vi`/`oi` ("incl.").
# All "excl." curves are drawn before the "incl." ones, so legend order is unchanged.
for grouped, line_style, tag in ((grouped_e, "--", " (excl.)"), (grouped_i, "-", " (incl.)")):
    for key, grp in grouped:
        plt.plot(grp["compression_ratio_fixed"], grp["rms_error"], line_style,
                 label="K = " + str(key) + tag)
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean rms error")
plt.ylim((0.001, 0.0025))
Out[24]:
In [26]:
# max error vs compression ratio, one line per K
#
# For the rows with K in 7..10: average every statistic over all variables, attach the
# overall statistics (which carry `compression_ratio_fixed`) and group the rows by K.
# The per-STATISTIC mean over the (VARIABLE, STATISTIC) columns is written as
# transpose -> groupby(level=...) -> transpose because `DataFrame.mean(level=...)`
# was deprecated in pandas 1.3 and removed in 2.0; this spelling also runs on old pandas.
# sort=False keeps the STATISTIC labels in their order of first appearance.
grouped_e = (
    ve.loc(axis=0)[7:10, :]
    .T.groupby(level="STATISTIC", sort=False).mean().T
    .join(oe)
    .reset_index()
    .groupby("K")
)
grouped_i = (
    vi.loc(axis=0)[7:10, :]
    .T.groupby(level="STATISTIC", sort=False).mean().T
    .join(oi)
    .reset_index()
    .groupby("K")
)

# Dashed lines: data from `ve`/`oe` ("excl."); solid lines: data from `vi`/`oi` ("incl.").
# All "excl." curves are drawn before the "incl." ones, so legend order is unchanged.
for grouped, line_style, tag in ((grouped_e, "--", " (excl.)"), (grouped_i, "-", " (incl.)")):
    for key, grp in grouped:
        plt.plot(grp["compression_ratio_fixed"], grp["max_error"], line_style,
                 label="K = " + str(key) + tag)
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("mean max error")
plt.xlim((0.075, 0.14))
plt.ylim((0.035, 0.085))
Out[26]:
In [7]:
# L_final vs compression ratio, one line per K.
# Uses the `grouped_e` / `grouped_i` groupings built by the preceding plotting cells.
# Solid lines: "excl." groups; dotted lines: "incl." groups ("excl." drawn first).
for _groups, _line_style, _tag in ((grouped_e, "-", " (excl.)"), (grouped_i, ":", " (incl.)")):
    for key, grp in _groups:
        plt.plot(
            grp["compression_ratio_fixed"],
            grp["L_final"],
            _line_style,
            label="K = " + str(key) + _tag,
        )
plt.legend()
plt.xlabel("compression ratio")
plt.ylabel("final L value")
# The y-range is left to matplotlib's autoscaling; only the x-range is pinned.
plt.xlim((0.07, 0.15))
Out[7]:
In [8]:
# Variable metadata, loaded so that each variable's "levels" value can be joined on later.
# The file is pivoted from long form (VARIABLE, INFO, ...) to one row per VARIABLE with
# one column per INFO key, keeping only the "VALUE" block of the pivoted columns.
v_info = (
    pd.read_table("variable_information.txt")
    .set_index(["VARIABLE", "INFO"])
    .unstack()
    .loc[:, "VALUE"]
)
# "levels" is cast to integer so it can be compared numerically downstream.
v_info["levels"] = v_info["levels"].astype("int")
# Blank out the name of the column index.
v_info.columns.name = ""
In [9]:
# Rank the variables by their error averaged over all (K, M) rows of `vi`.
#
# `_vi_mean_by_variable` has one row per VARIABLE and one column per STATISTIC.
# Each ranking sorts that table by one statistic in descending order (largest error
# first), joins `v_info` to pick up "levels", and resets the index twice: the first
# reset turns VARIABLE into a column, the second adds an "index" column holding the
# 0-based position in the sorted order (i.e. the rank).
#
# `DataFrame.sort` was removed in pandas 0.20; `sort_values` (pandas >= 0.17) is the
# replacement and is used here.
_vi_mean_by_variable = vi.mean(axis=0).unstack()

v_ranked_rms = (
    _vi_mean_by_variable
    .sort_values("rms_error", ascending=False)
    .join(v_info)["levels"]
    .reset_index()
    .reset_index()
)
v_ranked_max = (
    _vi_mean_by_variable
    .sort_values("max_error", ascending=False)
    .join(v_info)["levels"]
    .reset_index()
    .reset_index()
)
In [10]:
# Histogram of the rms-error rank (the "index" column) for variables with fewer than
# 30 levels, in bins of width 20 from 0 to 200.
_rms_few_levels = v_ranked_rms.levels < 30
v_ranked_rms.loc[_rms_few_levels, "index"].hist(bins=range(0, 220, 20))
plt.xlabel("rank when ordered by rms error (larger is better)")
plt.ylabel("number of variables")
plt.xlim((0, 200))
plt.ylim((0, 20))

# Share of those variables whose row position is >= 93.
# NOTE(review): 93 is hard-coded; presumably half the number of ranked variables -- confirm.
_rms_few_levels_upper = _rms_few_levels & (v_ranked_rms.index >= 93)
print(
    "Percentage in upper half:",
    100 * len(v_ranked_rms[_rms_few_levels_upper]) / len(v_ranked_rms[_rms_few_levels]),
)
In [11]:
# Histogram of the max-error rank (the "index" column) for variables with fewer than
# 30 levels, in bins of width 20 from 0 to 200.
_max_few_levels = v_ranked_max.levels < 30
v_ranked_max.loc[_max_few_levels, "index"].hist(bins=range(0, 220, 20))
plt.xlabel("rank when ordered by maximum error (larger is better)")
plt.ylabel("number of variables")
plt.xlim((0, 200))
plt.ylim((0, 20))

# Share of those variables whose row position is >= 93.
# NOTE(review): 93 is hard-coded; presumably half the number of ranked variables -- confirm.
_max_few_levels_upper = _max_few_levels & (v_ranked_max.index >= 93)
print(
    "Percentage in upper half:",
    100 * len(v_ranked_max[_max_few_levels_upper]) / len(v_ranked_max[_max_few_levels]),
)